# Import required R libraries
library(fpp3)
library(tidyverse)
library(readxl)
library(seasonal)
This project consists of 3 parts — two required and one bonus — and is worth 15% of your grade. The project is due at 11:59 PM on Sunday, October 31. I will accept late submissions, with a penalty, until the meetup after that, when we review some projects.
In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable Cash is provided in hundreds of dollars; other than that it is straightforward. I am being somewhat ambiguous on purpose to give this more of a business feeling. Explain and demonstrate your process, the techniques used and not used, and your actual forecast. I am giving you data via an Excel file; please provide your written report — findings, visuals, discussion, and your R code — via an RPubs link along with the actual .rmd file. Also, please submit the forecast in an Excel-readable file.
# Read in the raw ATM data and convert the Excel serial DATE column to a
# proper Date in one pipeline (Excel's day-zero origin is 1899-12-30).
# https://stackoverflow.com/questions/43230470/how-to-convert-excel-date-format-to-proper-date-in-r
atm_data_raw <- read_excel("data/ATM624Data.xlsx") %>%
  mutate(DATE = as.Date(DATE, origin = "1899-12-30"))
# Initial output to see data
head(atm_data_raw)
## # A tibble: 6 × 3
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-01 ATM2 107
## 3 2009-05-02 ATM1 82
## 4 2009-05-02 ATM2 89
## 5 2009-05-03 ATM1 85
## 6 2009-05-03 ATM2 90
# Output summary for high level assessment
summary(atm_data_raw)
## DATE ATM Cash
## Min. :2009-05-01 Length:1474 Min. : 0.0
## 1st Qu.:2009-08-01 Class :character 1st Qu.: 0.5
## Median :2009-11-01 Mode :character Median : 73.0
## Mean :2009-10-31 Mean : 155.6
## 3rd Qu.:2010-02-01 3rd Qu.: 114.0
## Max. :2010-05-14 Max. :10919.8
## NA's :19
# Check dimensions to understand breadth of data
dim(atm_data_raw)
## [1] 1474 3
# 1474 3
#Cash in hundreds
# Define as tsibble with DATE as index
atm_data_ts <- atm_data_raw %>%
as_tsibble(index = DATE, key = ATM)
# Output tsibble to confirm proper format and definition
atm_data_ts
## # A tsibble: 1,474 x 3 [1D]
## # Key: ATM [5]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-02 ATM1 82
## 3 2009-05-03 ATM1 85
## 4 2009-05-04 ATM1 90
## 5 2009-05-05 ATM1 99
## 6 2009-05-06 ATM1 88
## 7 2009-05-07 ATM1 8
## 8 2009-05-08 ATM1 104
## 9 2009-05-09 ATM1 87
## 10 2009-05-10 ATM1 93
## # … with 1,464 more rows
# Filter to ATM1 data
atm1_data_ts <- atm_data_ts %>%
filter(ATM == 'ATM1')
# Output summary
summary(atm1_data_ts)
## DATE ATM Cash
## Min. :2009-05-01 Length:365 Min. : 1.00
## 1st Qu.:2009-07-31 Class :character 1st Qu.: 73.00
## Median :2009-10-30 Mode :character Median : 91.00
## Mean :2009-10-30 Mean : 83.89
## 3rd Qu.:2010-01-29 3rd Qu.:108.00
## Max. :2010-04-30 Max. :180.00
## NA's :3
#atm1_data_ts
atm1_data_ts %>%
autoplot(Cash)
# Impute the 3 missing ATM1 Cash values with the series median.
# Named atm1_cash_median so it does not mask stats::median, and done with
# replace_na() instead of mutating the tsibble's column in place.
atm1_cash_median <- median(atm1_data_ts$Cash, na.rm = TRUE)
atm1_data_ts <- atm1_data_ts %>%
  mutate(Cash = replace_na(Cash, atm1_cash_median))
summary(atm1_data_ts)
## DATE ATM Cash
## Min. :2009-05-01 Length:365 Min. : 1.00
## 1st Qu.:2009-07-31 Class :character 1st Qu.: 73.00
## Median :2009-10-30 Mode :character Median : 91.00
## Mean :2009-10-30 Mean : 83.95
## 3rd Qu.:2010-01-29 3rd Qu.:108.00
## Max. :2010-04-30 Max. :180.00
head(atm1_data_ts)
## # A tsibble: 6 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-02 ATM1 82
## 3 2009-05-03 ATM1 85
## 4 2009-05-04 ATM1 90
## 5 2009-05-05 ATM1 99
## 6 2009-05-06 ATM1 88
# NOTE(review): DATE is already a Date and already the tsibble index, so this
# mutate/update_tsibble pair appears to be a no-op kept from earlier
# exploration — confirm and consider removing.
atm1_data_ts <- atm1_data_ts %>%
mutate(DATE = as_date(DATE)) %>%
update_tsibble(index = DATE)
atm1_data_ts
## # A tsibble: 365 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-02 ATM1 82
## 3 2009-05-03 ATM1 85
## 4 2009-05-04 ATM1 90
## 5 2009-05-05 ATM1 99
## 6 2009-05-06 ATM1 88
## 7 2009-05-07 ATM1 8
## 8 2009-05-08 ATM1 104
## 9 2009-05-09 ATM1 87
## 10 2009-05-10 ATM1 93
## # … with 355 more rows
# From: https://stats.stackexchange.com/questions/494013/control-the-period-for-daily-time-series-in-tsibbles
atm1_dcmp <- atm1_data_ts %>%
model(stl = STL(Cash ~ trend(window=Inf) + season(period=7, window="periodic")))
atm1_dcmp %>% components() %>% autoplot()
components(atm1_dcmp) %>%
as_tsibble() %>%
filter_index("2009-05-01" ~ "2010-04-30") %>%
autoplot(Cash, colour = "gray") +
geom_line(aes(y=season_adjust), colour = "#0072B2") +
labs(y = "Cash (in hundreds $USD)", title = "Seasonally Adjusted Trendline")
components(atm1_dcmp) %>%
as_tsibble() %>%
filter_index("2010-02-16" ~ "2010-04-30") %>%
autoplot(Cash, colour = "gray") +
geom_line(aes(y=season_adjust), colour = "#0072B2") +
labs(y = "Cash (in hundreds $USD)", title = "Seasonally Adjusted Trendline")
# from textbook: If the seasonal component is removed from the original data, the resulting values are the “seasonally adjusted” data.
Above plot shows that the seasonally adjusted data does not account for the weekly seasonal nature for the last three months. There is a shift.
atm1_data_ts %>% ACF() %>% autoplot()
atm1_data_ts %>% PACF() %>% autoplot()
atm1_data_ts %>% gg_season(Cash, period = "week") +
theme(legend.position = "none") +
labs(y="$USD (in hundreds)", title="ATM Withdrawals")
atm1_data_ts %>%
gg_subseries(period = "week") +
labs(
y = "$USD (in hundreds)",
title = "ATM Withdrawals"
)
# Create training set (assume 1 year of data)
# 293 days for training (2009-05-01 through 2010-02-17), approx. 80% of
# the 365 daily observations
train_atm1 <- atm1_data_ts %>%
filter_index("2009-05-01" ~ "2010-02-17")
train_atm1
## # A tsibble: 293 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-02 ATM1 82
## 3 2009-05-03 ATM1 85
## 4 2009-05-04 ATM1 90
## 5 2009-05-05 ATM1 99
## 6 2009-05-06 ATM1 88
## 7 2009-05-07 ATM1 8
## 8 2009-05-08 ATM1 104
## 9 2009-05-09 ATM1 87
## 10 2009-05-10 ATM1 93
## # … with 283 more rows
# Fit the models
fit_atm1 <- train_atm1 %>%
model(
Naive = NAIVE(Cash),
`Seasonal naive` = SNAIVE(Cash),
`Random walk` = RW(Cash ~ drift()),
Arima = ARIMA(Cash),
ETS_Add = ETS(Cash ~ error("A") + trend("N") + season("A")),
ETS_Mult = ETS(Cash ~ error("M") + trend("N") + season("M")),
ETS_Damp = ETS(Cash ~ error("M") + trend("Ad") + season("M"))
)
fit_atm1_arima <- train_atm1 %>%
model(ARIMA(Cash))
fit_atm1 %>% accuracy()
## # A tibble: 7 × 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 Naive Train… 2.84e- 1 49.9 37.4 -85.0 120. 2.07 1.77 -0.356
## 2 ATM1 Seasonal … Train… 2.03e- 1 28.1 18.0 -83.3 103. 1 1 0.0957
## 3 ATM1 Random wa… Train… -2.13e-16 49.9 37.5 -85.7 121. 2.08 1.77 -0.356
## 4 ATM1 Arima Train… 1.79e- 1 21.9 14.3 -66.5 82.7 0.790 0.780 0.00341
## 5 ATM1 ETS_Add Train… 1.75e- 1 21.4 13.7 -73.3 89.7 0.761 0.763 0.0940
## 6 ATM1 ETS_Mult Train… 1.98e+ 0 21.6 13.7 -75.8 90.6 0.760 0.769 0.0956
## 7 ATM1 ETS_Damp Train… 1.46e+ 0 21.3 13.3 -76.4 90.4 0.737 0.759 0.0756
# Check the residuals of Seasonal Naive
fit_atm1_arima %>% gg_tsresiduals()
# Generate forecasts for 72 days
fc_atm1 <- fit_atm1 %>% forecast(h = "72 days")
fc_atm1 %>% accuracy(atm1_data_ts)
## # A tibble: 7 × 11
## .model ATM .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Arima ATM1 Test -2.20 50.8 37.8 -466. 502. 2.09 1.81 -0.0565
## 2 ETS_Add ATM1 Test -3.25 52.4 38.0 -479. 514. 2.11 1.86 0.00348
## 3 ETS_Damp ATM1 Test -10.4 54.2 40.0 -517. 547. 2.22 1.93 -0.0246
## 4 ETS_Mult ATM1 Test -7.03 52.9 39.2 -494. 527. 2.17 1.88 -0.00808
## 5 Naive ATM1 Test -98.2 104. 98.2 -893. 893. 5.44 3.71 -0.0585
## 6 Random walk ATM1 Test -109. 114. 109. -944. 944. 6.02 4.07 0.0162
## 7 Seasonal naive ATM1 Test -5.21 64.1 53.7 -460. 509. 2.97 2.28 0.00878
# Plot forecasts against actual values.
# Fixed: train_atm1 ends 2010-02-17, so filtering it to 2010-02-18 onward
# produced an empty background layer. Show the tail of the training series
# instead, with the held-out actuals overlaid in black.
fc_atm1 %>%
  autoplot(filter_index(train_atm1, "2010-01-01" ~ .), level = NULL) +
  autolayer(
    filter_index(atm1_data_ts, "2010-02-18" ~ "2010-04-30"),
    colour = "black"
  ) +
  labs(
    y = "Cash (in hundreds $USD)",
    title = "Forecasts for ATM1 Withdrawals"
  ) +
  guides(colour = guide_legend(title = "Forecast"))
On the test data, the ARIMA model has the best RMSE (50.8); the next best is ETS_Add (52.4). On the training data, ARIMA's RMSE was 21.9 versus 28.1 for the Seasonal Naive.
The forecasts don’t line up with the actual data well. The seasonal nature of the actual data doesn’t match the seasonal nature of the forecasted data. Further investigation is needed.
### REMOVE THIS SECTION, AS I'VE PUT ALL THE MODELS INTO ONE INITIAL ATTEMPT #####
# ETS candidate models: additive, multiplicative, and damped-trend
fit_atm1_ets <- train_atm1 %>%
  model(
    add = ETS(Cash ~ error("A") + trend("N") + season("A")),
    mult = ETS(Cash ~ error("M") + trend("N") + season("M")),
    damp = ETS(Cash ~ error("M") + trend("Ad") + season("M"))
  )
# Forecast the 72-day holdout (2010-02-18 through 2010-04-30; previously
# stated as 73 days, which overshoots the available actuals by one day)
fc_atm1_ets <- fit_atm1_ets %>% forecast(h = "72 days")
fc_atm1_ets %>%
  autoplot(filter_index(train_atm1, "2009-12-01" ~ "2010-04-30"), level = NULL) +
  autolayer(
    filter_index(atm1_data_ts, "2009-12-01" ~ "2010-04-30"),
    colour = "black"
  ) +
  # Fixed: labels were copy-pasted from another exercise ("GDP: China")
  labs(y = "Cash (in hundreds $USD)", title = "Forecasts for ATM1 Withdrawals") +
  guides(colour = guide_legend(title = "Forecast"))
# ATM1: refit on only the final regime, 2010-02-16 to 2010-04-30 (end)
# 13 + 31 + 30 = 74 days total; training through 2010-04-14 is 58 days
# (about 78% of the window)
new_seasonal_atm1 <- atm1_data_ts %>%
filter_index("2010-02-16" ~ "2010-04-30")
train_atm1 <- atm1_data_ts %>%
filter_index("2010-02-16" ~ "2010-04-14")
# Consider Box-Cox
lambda <- new_seasonal_atm1 %>%
features(Cash, features = guerrero) %>%
pull(lambda_guerrero)
new_seasonal_atm1 %>%
autoplot(box_cox(Cash, lambda)) +
labs(y = "",
title = latex2exp::TeX(paste0(
"Transformed ATM1 withdrawals with $\\lambda$ = ",
round(lambda,2))))
Applying the Box-Cox transformation, the SNaive RMSE rose to 10.09, versus 9.98 without it, so the transformation was not used going forward.
# Fit the models
fit_atm1 <- train_atm1 %>%
model(
`Seasonal naive` = SNAIVE(Cash),
Arima = ARIMA(Cash),
ETS_Add = ETS(Cash ~ error("A") + trend("N") + season("A")),
ETS_Mult = ETS(Cash ~ error("M") + trend("N") + season("M")),
ETS_Damp = ETS(Cash ~ error("M") + trend("Ad") + season("M"))
)
report(fit_atm1)
## # A tibble: 5 × 12
## ATM .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots MSE AMSE
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list> <dbl> <dbl>
## 1 ATM1 Seaso… 575. NA NA NA NA <NULL> <NULL> NA NA
## 2 ATM1 Arima 311. -219. 447. 448. 455. <cpl [9… <cpl [0… NA NA
## 3 ATM1 ETS_A… 327. -281. 582. 586. 602. <NULL> <NULL> 276. 330.
## 4 ATM1 ETS_M… 0.155 -308. 636. 641. 657. <NULL> <NULL> 1046. 1437.
## 5 ATM1 ETS_D… 0.405 -329. 683. 692. 710. <NULL> <NULL> 27524. 41523.
## # … with 1 more variable: MAE <dbl>
fit_atm1 %>% accuracy()
## # A tibble: 5 × 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 Seasonal naive Trai… -5.20 24.3 15.9 -214. 230. 1 1 0.444
## 2 ATM1 Arima Trai… 0.138 16.0 11.9 -97.6 143. 0.753 0.660 -0.127
## 3 ATM1 ETS_Add Trai… -1.19 16.6 11.7 -56.5 68.7 0.739 0.684 0.0722
## 4 ATM1 ETS_Mult Trai… -2.40 32.3 20.8 -125. 145. 1.31 1.33 0.480
## 5 ATM1 ETS_Damp Trai… -10.4 166. 61.7 -13.8 68.2 3.89 6.82 0.412
fit_atm1_arima<- train_atm1 %>%
model(ARIMA(Cash))
# Check the residuals of Seasonal Naive
fit_atm1_arima %>% gg_tsresiduals()
# Generate forecasts for 16 days (two weeks), so we'll see if it picks up the seasonal nature
fc_atm1 <- fit_atm1 %>% forecast(h = "16 days")
fc_atm1 %>% accuracy(new_seasonal_atm1)
## # A tibble: 5 × 11
## .model ATM .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Arima ATM1 Test 7.71 12.8 9.50 40.9 43.2 0.599 0.527 0.0154
## 2 ETS_Add ATM1 Test 1.06 11.8 8.88 5.50 15.3 0.560 0.486 -0.337
## 3 ETS_Damp ATM1 Test 46.6 54.0 46.6 57.3 58.4 2.94 2.22 0.0345
## 4 ETS_Mult ATM1 Test 9.17 15.1 11.0 -2.95 26.2 0.694 0.621 -0.161
## 5 Seasonal naive ATM1 Test 0.125 9.64 7.12 1.26 9.83 0.449 0.397 -0.00791
# Plot forecasts against actual values
fc_atm1 %>%
autoplot(filter_index(train_atm1, "2010-04-01" ~ "2010-04-30"), level = NULL) +
autolayer(
filter_index(new_seasonal_atm1, "2010-04-01" ~ "2010-04-30"),
colour = "black"
) +
labs(
y = "Cash (in hundreds $USD)",
title = "Forecasts for ATM1 Withdrawals"
) +
guides(colour = guide_legend(title = "Forecast"))
# Forecast for May 2010
fit_atm1 <- new_seasonal_atm1 %>%
model(ETS_Add = ETS(Cash ~ error("A") + trend("N") + season("A")),
ARIMA(Cash))
report(fit_atm1)
fit_atm1 %>% accuracy()
# Check the residuals of Seasonal Naive
#fit_atm1 %>% gg_tsresiduals()
# Generate forecasts for 31 days of May 2010
fc_atm1 <- fit_atm1 %>% forecast(h = "31 days")
# Plot the May 2010 forecasts following the most recent actual values.
# Fixed: the previous version added an autolayer() of exactly the same data
# that autoplot() already drew, producing a redundant duplicate layer.
fc_atm1 %>%
  autoplot(filter_index(new_seasonal_atm1, "2010-04-01" ~ "2010-04-30"), level = NULL) +
  labs(
    y = "Cash (in hundreds $USD)",
    title = "Forecasts for ATM1 Withdrawals"
  ) +
  guides(colour = guide_legend(title = "Forecast"))
# ETS: for just the final 76 days also
fit_atm1_ets <- train_atm1 %>%
model(
add = ETS(Cash ~ error("A") + trend("N") + season("A")),
mult = ETS(Cash ~ error("M") + trend("N") + season("M")),
damp = ETS(Cash ~ error("M") + trend("Ad") + season("M"))
)
fit_atm1_ets %>% accuracy()
# Forecast for 16 days
fc_atm1_ets <- fit_atm1_ets %>% forecast(h = "16 days")
fc_atm1_ets %>% accuracy(atm1_data_ts)
fc_atm1_ets %>%
autoplot(filter_index(train_atm1, "2010-02-16" ~ "2010-04-30"), level = NULL) +
autolayer(
filter_index(atm1_data_ts, "2010-02-16" ~ "2010-04-30"),
colour = "black"
) +
labs(y="$USD (in hundreds)", title="ATM1 Withdrawals Forecast") +
guides(colour = guide_legend(title = "Forecast"))
atm2_data_ts <- atm_data_ts %>%
filter(ATM == 'ATM2')
atm2_data_ts %>% autoplot(Cash)
# Impute the 2 missing ATM2 Cash values with the series median.
# Named atm2_cash_median so it does not mask stats::median, and done with
# replace_na() instead of mutating the tsibble's column in place.
atm2_cash_median <- median(atm2_data_ts$Cash, na.rm = TRUE)
atm2_data_ts <- atm2_data_ts %>%
  mutate(Cash = replace_na(Cash, atm2_cash_median))
atm2_data_ts
## # A tsibble: 365 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM2 107
## 2 2009-05-02 ATM2 89
## 3 2009-05-03 ATM2 90
## 4 2009-05-04 ATM2 55
## 5 2009-05-05 ATM2 79
## 6 2009-05-06 ATM2 19
## 7 2009-05-07 ATM2 2
## 8 2009-05-08 ATM2 103
## 9 2009-05-09 ATM2 107
## 10 2009-05-10 ATM2 118
## # … with 355 more rows
atm2_data_ts %>%
model(stl = STL(Cash ~ trend(window=Inf) + season(period=7, window="periodic"))
) %>%
components() %>% autoplot()
As with ATM1, the weekly seasonal pattern shifts in the final few months. I will therefore restrict the forecast model to just that final section where the weekly seasonality changes.
atm2_data_ts %>% ACF() %>% autoplot()
atm2_data_ts %>% PACF() %>% autoplot()
atm2_data_ts %>% gg_season(Cash, period = "week") +
theme(legend.position = "none") +
labs(y="$ (in hundreds)", title="ATM Withdrawals")
atm2_data_ts %>%
gg_subseries(period = "week") +
labs(
y = "$ (in hundreds)",
title = "ATM Withdrawals"
)
# From: https://stats.stackexchange.com/questions/494013/control-the-period-for-daily-time-series-in-tsibbles
atm2_dcmp <- atm2_data_ts %>%
model(stl = STL(Cash ~ trend(window=Inf) + season(period=7, window="periodic")))
atm2_dcmp %>% components() %>% autoplot()
components(atm2_dcmp) %>%
as_tsibble() %>%
filter_index("2009-05-01" ~ "2010-04-30") %>%
autoplot(Cash, colour = "gray") +
geom_line(aes(y=season_adjust), colour = "#0072B2") +
labs(y = "Cash (in hundreds)", title = "Seasonally Adjusted Trendline")
components(atm2_dcmp) %>%
as_tsibble() %>%
filter_index("2010-02-16" ~ "2010-04-30") %>%
autoplot(Cash, colour = "gray") +
geom_line(aes(y=season_adjust), colour = "#0072B2") +
labs(y = "Cash (in hundreds)", title = "Seasonally Adjusted Trendline")
# from textbook: If the seasonal component is removed from the original data, the resulting values are the “seasonally adjusted” data.
atm3_data_ts <- atm_data_ts %>%
filter(ATM == 'ATM3')
atm3_data_ts %>% autoplot(Cash)
atm3_data_ts <- atm_data_ts %>%
filter(ATM == 'ATM3', Cash > 0)
atm3_data_ts %>% autoplot(Cash)
summary(atm3_data_ts)
## DATE ATM Cash
## Min. :2010-04-28 Length:3 Min. :82.00
## 1st Qu.:2010-04-28 Class :character 1st Qu.:83.50
## Median :2010-04-29 Mode :character Median :85.00
## Mean :2010-04-29 Mean :87.67
## 3rd Qu.:2010-04-29 3rd Qu.:90.50
## Max. :2010-04-30 Max. :96.00
atm3_data_ts
## # A tsibble: 3 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2010-04-28 ATM3 96
## 2 2010-04-29 ATM3 82
## 3 2010-04-30 ATM3 85
# Literally just 3 values above 0
atm3_data_ts <- atm_data_ts %>%
filter(ATM == 'ATM3')
atm3_data_ts %>% autoplot(Cash)
atm3_data_ts <- atm_data_ts %>%
filter(ATM == 'ATM3', Cash > 0)
atm3_data_ts %>% autoplot(Cash)
summary(atm3_data_ts)
## DATE ATM Cash
## Min. :2010-04-28 Length:3 Min. :82.00
## 1st Qu.:2010-04-28 Class :character 1st Qu.:83.50
## Median :2010-04-29 Mode :character Median :85.00
## Mean :2010-04-29 Mean :87.67
## 3rd Qu.:2010-04-29 3rd Qu.:90.50
## Max. :2010-04-30 Max. :96.00
atm3_data_ts
## # A tsibble: 3 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2010-04-28 ATM3 96
## 2 2010-04-29 ATM3 82
## 3 2010-04-30 ATM3 85
# Literally just 3 values above 0
# With only 3 non-zero observations, a MEAN model is the only defensible choice
fit_atm3 <- atm3_data_ts %>% model(MEAN(Cash))
# Forecast for the 31 days of May 2010
fc_atm3 <- fit_atm3 %>% forecast(h = "31 days")
# Fixed: this plot previously overlaid atm1_data_ts (the wrong series) and was
# titled "ATM1" even though this is the ATM3 forecast.
fc_atm3 %>%
  autoplot(filter_index(atm3_data_ts, "2010-04-28" ~ "2010-05-31"), level = NULL) +
  labs(y = "$USD (in hundreds)", title = "ATM3 Withdrawals Forecast") +
  guides(colour = guide_legend(title = "Forecast"))
There are only 3 non-zero values, so I assume ATM3 existed but had essentially no activity over the period; if the machine had not existed, the values would presumably have been NA rather than zero.
atm4_data_ts <- atm_data_ts %>%
filter(ATM == 'ATM4')
atm4_data_ts %>% autoplot(Cash)
summary(atm4_data_ts)
## DATE ATM Cash
## Min. :2009-05-01 Length:365 Min. : 1.563
## 1st Qu.:2009-07-31 Class :character 1st Qu.: 124.334
## Median :2009-10-30 Mode :character Median : 403.839
## Mean :2009-10-30 Mean : 474.043
## 3rd Qu.:2010-01-29 3rd Qu.: 704.507
## Max. :2010-04-30 Max. :10919.762
atm4_data_ts %>%
model(stl = STL(Cash ~ trend(window=Inf) + season(period=7, window="periodic"))
) %>%
components() %>% autoplot()
atm4_data_ts
## # A tsibble: 365 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM4 777.
## 2 2009-05-02 ATM4 524.
## 3 2009-05-03 ATM4 793.
## 4 2009-05-04 ATM4 908.
## 5 2009-05-05 ATM4 52.8
## 6 2009-05-06 ATM4 52.2
## 7 2009-05-07 ATM4 55.5
## 8 2009-05-08 ATM4 559.
## 9 2009-05-09 ATM4 904.
## 10 2009-05-10 ATM4 879.
## # … with 355 more rows
# ATM4 has no missing values, but one extreme outlier (max 10,919.8 vs a
# median of ~404). Replace any value above 3000 with the series median.
# (The old comment said "Set NAs to median", which did not match the code.)
atm4_cash_median <- median(atm4_data_ts$Cash, na.rm = TRUE)
atm4_data_ts <- atm4_data_ts %>%
  mutate(Cash = if_else(Cash > 3000, atm4_cash_median, Cash))
atm4_data_ts %>% ACF() %>% autoplot()
atm4_data_ts %>% PACF() %>% autoplot()
atm4_data_ts %>% gg_season(Cash, period = "week") +
theme(legend.position = "none") +
labs(y="$ (in hundreds)", title="ATM Withdrawals")
atm4_data_ts %>%
gg_subseries(period = "week") +
labs(
y = "$ (in hundreds)",
title = "ATM Withdrawals"
)
Summary output DATE ATM Cash
Min. :39934 Length:1474 Min. : 0.0
1st Qu.:40026 Class :character 1st Qu.: 0.5
Median :40118 Mode :character Median : 73.0
Mean :40118 Mean : 155.6
3rd Qu.:40210 3rd Qu.: 114.0
Max. :40312 Max. :10919.8
NA’s :19
Date 39934 to 40312
379 dates
ATM1, ATM2, ATM3, ATM4, NA
Cash 19 NA’s
Data observations: ATM1: 3 Cash NA values ATM2: 2 Cash NA values ATM3: Only 3 Cash values with something above zero ATM4: Many Cash values with a decimal, but not all, something weird there, also ATM4 appears to have 1 really crazy outlier Final 14 entries are NA, NA (DATE of 40299 and higher are NA, NA)
Dimensions output [1] 1474 3
So remove the last 14 and all that remains is 1460. And 1460 / 4 is 365, so thus one year.
Total by day and see if there is an overall relationship to be understood
# Impute the 19 missing Cash values across all ATMs with the overall median —
# a straightforward, defensible approach for this combined view.
# Named atm_cash_median so it does not mask stats::median.
atm_cash_median <- median(atm_data_ts$Cash, na.rm = TRUE)
atm_data_ts <- atm_data_ts %>%
  mutate(
    Cash = replace_na(Cash, atm_cash_median),
    # Also replace the extreme ATM4 outlier (> 5000) with the median
    Cash = if_else(Cash > 5000, atm_cash_median, Cash)
  )
atm_data_ts %>% autoplot(Cash)
atm_data_total_by_day <- atm_data_ts %>%
index_by(DATE) %>%
summarize(Cash_Total = sum(Cash))
atm_data_total_by_day
## # A tsibble: 379 x 2 [1D]
## DATE Cash_Total
## <date> <dbl>
## 1 2009-05-01 980.
## 2 2009-05-02 695.
## 3 2009-05-03 968.
## 4 2009-05-04 1053.
## 5 2009-05-05 231.
## 6 2009-05-06 159.
## 7 2009-05-07 65.5
## 8 2009-05-08 766.
## 9 2009-05-09 1098.
## 10 2009-05-10 1090.
## # … with 369 more rows
atm_data_total_by_day %>% autoplot(Cash_Total)
atm_data_total_by_day %>%
model(stl = STL(Cash_Total ~ trend(window=Inf) + season(period=7, window="periodic"))
) %>%
components() %>% autoplot()
# Seasonally adjusted plot
atm_dcmp <- atm_data_total_by_day %>%
model(stl = STL(Cash_Total ~ trend(window=Inf) + season(period=7, window="periodic")))
atm_dcmp %>% components() %>% autoplot()
components(atm_dcmp) %>%
as_tsibble() %>%
autoplot(Cash_Total, colour = "gray") +
geom_line(aes(y=season_adjust), colour = "#0072B2") +
labs(y = "Cash (in hundreds)", title = "Seasonally Adjusted Trendline")
Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.
# Read in data
power_data_raw <- read_excel("data/ResidentialCustomerForecastLoad-624.xlsx")
# Change column name to 'Month'
names(power_data_raw)[names(power_data_raw) == 'YYYY-MMM'] <- 'Month'
# Display first 5 rows for visual inspection
head(power_data_raw)
## # A tibble: 6 × 3
## CaseSequence Month KWH
## <dbl> <chr> <dbl>
## 1 733 1998-Jan 6862583
## 2 734 1998-Feb 5838198
## 3 735 1998-Mar 5420658
## 4 736 1998-Apr 5010364
## 5 737 1998-May 4665377
## 6 738 1998-Jun 6467147
# Display summary for initial assessment
summary(power_data_raw)
## CaseSequence Month KWH
## Min. :733.0 Length:192 Min. : 770523
## 1st Qu.:780.8 Class :character 1st Qu.: 5429912
## Median :828.5 Mode :character Median : 6283324
## Mean :828.5 Mean : 6502475
## 3rd Qu.:876.2 3rd Qu.: 7620524
## Max. :924.0 Max. :10655730
## NA's :1
# Display dimensions for assessment
# should be 16 years of data 1998-2013
# Yep, 192/12 = 16
dim(power_data_raw)
## [1] 192 3
# 192 3
# KWH
power_data_ts <- power_data_raw %>%
mutate(Month = yearmonth(Month)) %>%
mutate(KWH = KWH/1e3) %>% # In thousands
as_tsibble(index = Month)
# Output first 5 rows of tsibble
head(power_data_ts)
## # A tsibble: 6 x 3 [1M]
## CaseSequence Month KWH
## <dbl> <mth> <dbl>
## 1 733 1998 Jan 6863.
## 2 734 1998 Feb 5838.
## 3 735 1998 Mar 5421.
## 4 736 1998 Apr 5010.
## 5 737 1998 May 4665.
## 6 738 1998 Jun 6467.
# Inital plot of data
power_data_ts %>%
autoplot(KWH)
power_data_ts %>%
filter_index("2010" ~ "2011") %>%
print()
## # A tsibble: 13 x 3 [1M]
## CaseSequence Month KWH
## <dbl> <mth> <dbl>
## 1 877 2010 Jan 9397.
## 2 878 2010 Feb 8391.
## 3 879 2010 Mar 7348.
## 4 880 2010 Apr 5776.
## 5 881 2010 May 4919.
## 6 882 2010 Jun 6696.
## 7 883 2010 Jul 771.
## 8 884 2010 Aug 7923.
## 9 885 2010 Sep 7819.
## 10 886 2010 Oct 5876.
## 11 887 2010 Nov 4801.
## 12 888 2010 Dec 6153.
## 13 889 2011 Jan 8395.
First observations: the data is monthly, so I would expect a seasonal component. One value of KWH is NA (2008 Sep); I am considering imputing it with the median September value. There is also one outlier (a very small value, July 2010); I am considering imputing it with the median July value.
power_data_ts
## # A tsibble: 192 x 3 [1M]
## CaseSequence Month KWH
## <dbl> <mth> <dbl>
## 1 733 1998 Jan 6863.
## 2 734 1998 Feb 5838.
## 3 735 1998 Mar 5421.
## 4 736 1998 Apr 5010.
## 5 737 1998 May 4665.
## 6 738 1998 Jun 6467.
## 7 739 1998 Jul 8915.
## 8 740 1998 Aug 8607.
## 9 741 1998 Sep 6990.
## 10 742 1998 Oct 6346.
## # … with 182 more rows
library(stringr)
# Median of all September KWH values, used to impute the 2008 Sep NA
sep_kwh_med <- power_data_ts %>%
  as_tibble() %>%
  filter(str_detect(Month, "Sep")) %>%
  summarise(Median = median(KWH, na.rm = TRUE)) %>%
  pull(Median)
sep_kwh_med
# Median 7666.97
# Impute by matching the Month value rather than the hard-coded cell
# power_data_ts[129, 3], which silently breaks if rows or columns move
power_data_ts$KWH[power_data_ts$Month == yearmonth("2008 Sep")] <- sep_kwh_med
power_data_ts
## # A tsibble: 192 x 3 [1M]
## CaseSequence Month KWH
## <dbl> <mth> <dbl>
## 1 733 1998 Jan 6863.
## 2 734 1998 Feb 5838.
## 3 735 1998 Mar 5421.
## 4 736 1998 Apr 5010.
## 5 737 1998 May 4665.
## 6 738 1998 Jun 6467.
## 7 739 1998 Jul 8915.
## 8 740 1998 Aug 8607.
## 9 741 1998 Sep 6990.
## 10 742 1998 Oct 6346.
## # … with 182 more rows
# Median of all July KWH values, used to replace the 2010 Jul outlier (771)
jul_kwh_med <- power_data_ts %>%
  as_tibble() %>%
  filter(str_detect(Month, "Jul")) %>%
  summarise(Median = median(KWH, na.rm = TRUE)) %>%
  pull(Median)
jul_kwh_med
# Median 7678.623
# Impute by matching the Month value rather than the hard-coded cell
# power_data_ts[151, 3], which silently breaks if rows or columns move
power_data_ts$KWH[power_data_ts$Month == yearmonth("2010 Jul")] <- jul_kwh_med
power_data_ts %>% autoplot(KWH)
Perhaps a Box-Cox is needed here, there does seem to be trending upward
# STL
power_data_ts %>%
model(stl = STL(KWH ~ trend(window=Inf) + season(period=12, window="periodic"))) %>%
components() %>% autoplot()
power_dcmp <- power_data_ts %>%
model(stl = STL(KWH ~ trend(window=Inf) + season(period=12, window="periodic")))
power_dcmp %>% components() %>% autoplot()
components(power_dcmp) %>%
as_tsibble() %>%
autoplot(KWH, colour = "gray") +
geom_line(aes(y=season_adjust), colour = "#0072B2") +
labs(y = "KWH (in thousands)", title = "Seasonally Adjusted Trendline")
# from textbook: If the seasonal component is removed from the original data, the resulting values are the “seasonally adjusted” data.
Seasonally adjusted doesn’t look too bad
# Seasonal differencing
power_data_ts %>%
gg_tsdisplay(difference(KWH, 12),
plot_type='partial', lag=36) +
labs(title="Seasonally differenced", y="")
# ARIMA model: stepwise search and approximation disabled for a fuller search
# (use TRUE/FALSE rather than the reassignable T/F shortcuts)
arima_mod_power <- power_data_ts %>%
  model(ARIMA(KWH, stepwise = FALSE, approx = FALSE))
report(arima_mod_power)
arima_mod_power %>% gg_tsresiduals(lag = 36)
# Fixed: the labs() call used curly "smart" quotes, which is a syntax error in R
forecast(arima_mod_power, h = 36) %>%
  autoplot(power_data_ts) +
  labs(title = "KWH Used", y = "KWH (in thousands)")
# Forecasting
# ATM1 train on 2010-02-16 to 2010-04-30 (end)
# 15 + 31 + 30 = 76 days, 80% is 61 days
# Create training set (365 total, perhaps assume 1 year of data)
# 292 days, yes, using the generating dates based on the integer values
# Train on 154 months, which is 12 years and 8 months (80% of the total)
# Total is 192 months (16 years)
train_power_data <- power_data_ts %>%
filter_index("1998-Jan" ~ "2010-Oct")
lambda <- power_data_ts %>%
features(KWH, features = guerrero) %>%
pull(lambda_guerrero)
train_power_data %>%
autoplot(box_cox(KWH, lambda)) +
labs(y = "",
title = latex2exp::TeX(paste0(
"Transformed ATM1 withdrawals with $\\lambda$ = ",
round(lambda,2))))
# Fit the models
fit_power <- train_power_data %>%
model(
Naive = NAIVE(box_cox(KWH, lambda)),
`Seasonal naive` = SNAIVE(box_cox(KWH, lambda)),
`Random walk` = RW(box_cox(KWH, lambda)),
Arima_BC = ARIMA(box_cox(KWH, lambda)),
Arima = ARIMA(KWH)
)
fit_power$Arima_BC
## <lst_mdl[1]>
## [1] <ARIMA(1,0,1)(0,1,1)[12] w/ drift>
fit_power$Arima
## <lst_mdl[1]>
## [1] <ARIMA(0,0,3)(0,1,1)[12] w/ drift>
#fit_atm1_snaive <- train_atm1 %>%
# model(SNAIVE(Cash))
fit_power %>% accuracy()
## # A tibble: 5 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Naive Training -6.45 1289. 1069. -2.21 17.2 1.92 1.80 0.182
## 2 Seasonal naive Training 69.0 715. 557. 0.478 8.79 1 1 0.267
## 3 Random walk Training -6.45 1289. 1069. -2.21 17.2 1.92 1.80 0.182
## 4 Arima_BC Training 7.10 518. 397. -0.309 6.34 0.713 0.725 0.0885
## 5 Arima Training -11.7 499. 383. -0.687 6.16 0.688 0.698 -0.00393
# Check the residuals of Seasonal Naive
#fit_atm1_snaive %>% gg_tsresiduals()
# Generate forecasts for 38 months, so we'll see if it picks up the seasonal nature
fc_power <- fit_power %>% forecast(h = "38 months")
fc_power %>% accuracy(power_data_ts)
## # A tibble: 5 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Arima Test 661. 1068. 777. 7.64 9.57 1.40 1.49 0.224
## 2 Arima_BC Test 621. 1034. 757. 7.25 9.42 1.36 1.45 0.229
## 3 Naive Test -1348. 2580. 2155. -23.4 32.0 3.87 3.61 0.689
## 4 Random walk Test -1348. 2580. 2155. -23.4 32.0 3.87 3.61 0.689
## 5 Seasonal naive Test 303. 1018. 850. 2.77 11.0 1.53 1.42 0.419
# Plot forecasts against actual values
fc_power %>%
autoplot(power_data_ts, level = NULL) +
autolayer(
filter_index(power_data_ts, "1998-Jan" ~ "2010-Oct"),
colour = "black"
) +
labs(
y = "KWH (in thousands)",
title = "Power Forecast"
) +
guides(colour = guide_legend(title = "Forecast"))
# Result: Not good, SNAIVE at least attempts the seasonal nature
Per the test-set accuracy table, the Seasonal naive model actually has the lowest test RMSE (1018) — narrowly beating ARIMA_BC (1034) — although ARIMA_BC is the better of the two ARIMA variants. ETS models still need to be tried.
Part C consists of two data sets. These are simple 2 columns sets, however they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour (example of what this looks like, follows). Note for multiple recordings within an hour, take the mean. Then to determine if the data is stationary and can it be forecast. If so, provide a week forward forecast and present results via Rpubs and .rmd and the forecast in an Excel readable file.
# Read in data
pipe1_data_raw <- read_excel("data/Waterflow_Pipe1.xlsx", col_types = c("date", "numeric"))
pipe2_data_raw <- read_excel("data/Waterflow_Pipe2.xlsx", col_types = c("date", "numeric"))
# From: https://stackoverflow.com/questions/53635818/convert-datetime-from-excel-to-r
pipe1_data_raw$`Date Time` <- as.POSIXct(pipe1_data_raw$`Date Time`,
origin="1899-12-30",
tz="GMT")
pipe2_data_raw$`Date Time` <- as.POSIXct(pipe2_data_raw$`Date Time`,
origin="1899-12-30",
tz="GMT")
# Initial output to see data
head(pipe1_data_raw)
## # A tibble: 6 × 2
## `Date Time` WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 00:24:06 23.4
## 2 2015-10-23 00:40:02 28.0
## 3 2015-10-23 00:53:51 23.1
## 4 2015-10-23 00:55:40 30.0
## 5 2015-10-23 01:19:17 6.00
## 6 2015-10-23 01:23:58 15.9
head(pipe2_data_raw)
## # A tibble: 6 × 2
## `Date Time` WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 01:00:00 18.8
## 2 2015-10-23 02:00:00 43.1
## 3 2015-10-23 03:00:00 38.0
## 4 2015-10-23 04:00:00 36.1
## 5 2015-10-23 05:00:00 31.9
## 6 2015-10-23 06:00:00 28.2
summary(pipe1_data_raw)
## Date Time WaterFlow
## Min. :2015-10-23 00:24:06 Min. : 1.067
## 1st Qu.:2015-10-25 11:21:35 1st Qu.:13.683
## Median :2015-10-27 20:07:30 Median :19.880
## Mean :2015-10-27 20:49:15 Mean :19.897
## 3rd Qu.:2015-10-30 08:24:51 3rd Qu.:26.159
## Max. :2015-11-01 23:35:43 Max. :38.913
summary(pipe2_data_raw)
## Date Time WaterFlow
## Min. :2015-10-23 01:00:00 Min. : 1.885
## 1st Qu.:2015-11-02 10:45:00 1st Qu.:28.140
## Median :2015-11-12 20:30:00 Median :39.682
## Mean :2015-11-12 20:30:00 Mean :39.556
## 3rd Qu.:2015-11-23 06:15:00 3rd Qu.:50.782
## Max. :2015-12-03 16:00:00 Max. :78.303
dim(pipe1_data_raw)
## [1] 1000 2
dim(pipe2_data_raw)
## [1] 1000 2
# Appendix: re-create the tsibble from the raw data (duplicate of the earlier
# definition). A stray Markdown code fence after `atm_data_ts` was a syntax
# error and has been removed.
atm_data_ts <- atm_data_raw %>% as_tsibble(index = DATE, key = ATM)
atm_data_ts
#30#